home *** CD-ROM | disk | FTP | other *** search
- /* WIDE AREA INFORMATION SERVER SOFTWARE:
- No guarantees or restrictions. See the readme file for the full standard
- disclaimer.
-
- Brewster@think.com
- */
-
- /*
- * Building an index with a Unix shell interface.
- *
- * -brewster 6/90
- */
-
- /* to do:
- * done: make incremental indexing not index things that are already indexed
- * add extra arg -register that will send in description of the server to
- * the directory of servers.
- * done: create a source struct in the .src file
- * make it continuously index to keep itself uptodate.
- *
- */
-
- #include <string.h>
- #include <sys/types.h>
- #include "irdirent.h"
- #include "cutil.h"
- #include "futil.h"
- #include "irfiles.h"
- #include "irtfiles.h"
- #include "panic.h"
- #include "ircfiles.h"
- #include "version.h"
- #include "irext.h"
-
- #define INDEXER_DATE "Fri Sep 13 1991" /* date string handed to the -v printf in main */
-
- /* for reporting errors, in WAIStation it is defined in CRetrievalApp.c */
-
- extern boolean indexingForBeta; /* defined elsewhere; main sets it to true for the undocumented -cm option */
-
- char *log_file_name = NULL; /* not referenced in the visible code; presumably read by the logging routines -- TODO confirm */
- FILE *logfile; /* main points this at stderr before it parses any arguments */
-
- /* This is the MAIN for building an index.
- */
- void
- main(argc, argv)
- int argc;
- char *argv[];
- {
- database* db = NULL;
- long argc_copy = argc;
- char **argv_copy = argv;
- char *next_argument;
- char index_filename[1000];
- boolean (*separator_function)();
- void (*header_function)();
- void (*finish_header_function)();
- long (*date_function)();
- boolean adding_to_existing_index = FALSE;
- boolean traverse_directory = FALSE;
- long memory_to_use = -1;
- boolean check_for_text_file = FALSE;
- boolean register_database = FALSE;
- boolean export_database = FALSE;
- char *typename = NULL; /* this is what the user said */
- char *type = NULL; /* this is the type stored with the db */
- long start_of_filenames;
- long hashtable_size = 1L<<16;
- long flush_after_n_words = 500000;
- char *command_name;
-
- next_argument = next_arg(&argc, &argv);
- separator_function = NULL; /* initailize to nil */
- header_function = NULL;
- date_function = NULL;
- finish_header_function = NULL;
- type = "TEXT"; /* default to text */
- typename = "Text";
-
- command_name = next_argument;
-
- logfile = stderr;
-
- if(0 == argc){ /* no args */
- printf("Usage: %s [-d index_filename]\n", next_argument);
- printf(" [-a] /* adding to an existing index, otherwise it erases the index */\n");
- printf(" [-r] /* recursively index subdirectories */\n");
- printf(" [-mem mbytes] /* number of megabytes to run this in */\n");
- printf(" [-register] /* registers the database with the directory of servers.\n");
- printf(" This should be done with care. */\n");
- printf(" [-export] /* uses short dbname and port 210 */\n");
- printf(" [-v] /* print the version of the software */\n");
- printf(" [-t /* format of the file. if none then each file is a document */\n");
- printf(" text /* simple text files, this is the default */\n");
- printf(" | groliers /* groliers encyclopedia format */\n");
- printf(" | mail /* unix mail and netnews format */\n");
- printf(" | rmail /* gnu rmail */\n");
- printf(" | mail_or_rmail /* mail or rmail or both */\n");
- printf(" | mail_digest /* standard internet mail digest format */\n");
- printf(" | mh_bboard /* MH bulletin board format */\n");
- printf(" | netnews /* netnews format */\n");
- printf(" | catalog /* Thinking Machines library catalog */\n");
- printf(" | bio /* biology abstract format */\n");
- printf(" | cmapp /* CM applications from Hypercard */\n");
- printf(" | pict /* pict files, only indexes the filename */\n");
- printf(" | gif /* gif files, only indexes the filename */\n");
- printf(" | tiff /* tiff files, only indexes the filename */\n");
- printf(" | jargon /* the jargon file (the hackers dictionary) */\n");
- printf(" | server /* server structures for the dir of servers */\n");
- printf(" | objc /* objective-C .h and .m files */\n");
- printf(" | irg /* internet resource guide */\n");
- printf(" | dash /* entries separated by a row of dashes */\n");
- printf(" | one_line /* each line is a document */\n");
- printf(" | para /* paragraphs separated by blank lines */\n");
- printf(" ] filename filename ...\n");
- exit(0);
- }
- #ifdef THINK_C
- strcpy(index_filename, "wais:System Folder:wais-index:index");
- #else
- strcpy(index_filename, "index"); /* in the current directory */
- #endif /* THINK_C */
-
- if(NULL == (next_argument = next_arg(&argc, &argv))){
- printf("No arguments specified\n");
- exit(0);
- }
- while('-' == next_argument[0]){
- /* then we have an argument to process */
- if((0 == strcmp("-i", next_argument)) || /* -i is for backcompatibility */
- (0 == strcmp("-d", next_argument))){
- if(NULL == (next_argument = next_arg(&argc, &argv))){
- printf("Expected filename for the index\n");
- exit(0);
- }
- strcpy(index_filename, next_argument);
- }
- else if(0 == strcmp("-a", next_argument)){
- adding_to_existing_index = true;
- }
- else if(0 == strcmp("-r", next_argument)){
- traverse_directory = true;
- }
- else if(0 == strcmp("-register", next_argument)){
- register_database = true;
- }
- else if(0 == strcmp("-export", next_argument)){
- export_database = true;
- }
- else if(0 == strcmp("-v", next_argument)){
- printf("%s: %s\n", command_name, VERSION, INDEXER_DATE);
- }
- else if(0 == strcmp("-mem", next_argument)){
- if(NULL == (next_argument = next_arg(&argc, &argv)))
- panic("Expected a number for the amount of memory to use");
- memory_to_use = atol(next_argument);
- if(memory_to_use < 1)
- panic("The -mem argument should not be less than 1");
- if(memory_to_use > 200)
- printf("Warning: The -mem parameter was %ld Mbytes. That is a large number of mega bytes in current machines\n", memory_to_use);
- }
- else if(0 == strcmp("-cm", next_argument)){
- /* this is an undocumented argument to help use this to
- front end the CM application */
- indexingForBeta = true;
- }
- else if(0 == strcmp("-t", next_argument)){
- /* then we have a specialized file */
- if(NULL == (next_argument = next_arg(&argc, &argv)))
- panic("Expected a file type");
- if(0 == strcmp("groliers", next_argument)){
- typename = next_argument;
- type = "TEXT";
- separator_function = groliers_separator_function;
- header_function = groliers_header_function;
- finish_header_function = groliers_finish_header_function;
- }
- else if(0 == strcmp("objc", next_argument)){
- typename = next_argument;
- type = "TEXT";
- separator_function = wobjc_separator_function;
- header_function = wobjc_header_function;
- finish_header_function = wobjc_finish_header_function;
- }
- else if(0 == strcmp("mail", next_argument)){
- typename = next_argument;
- type = "TEXT";
- separator_function = mail_separator_function;
- header_function = mail_header_function;
- date_function = mail_date_function;
- finish_header_function = mail_finish_header_function;
- }
- else if(0 == strcmp("mail_or_rmail", next_argument)){
- typename = next_argument;
- type = "TEXT";
- separator_function = mail_or_rmail_separator;
- header_function = mail_header_function;
- date_function = mail_date_function;
- finish_header_function = mail_finish_header_function;
- }
- else if(0 == strcmp("mail_digest", next_argument)){
- typename = next_argument;
- type = "TEXT";
- separator_function = mail_digest_separator_function;
- header_function = mail_header_function;
- date_function = mail_date_function;
- finish_header_function = mail_finish_header_function;
- }
- else if(0 == strcmp("mh_bboard", next_argument)){
- typename = next_argument;
- type = "TEXT";
- separator_function = mh_bboard_separator_function;
- header_function = mail_header_function;
- date_function = mail_date_function;
- finish_header_function = mail_finish_header_function;
- }
- else if(0 == strcmp("rmail", next_argument)){
- typename = next_argument;
- type = "TEXT";
- separator_function = rmail_separator_function;
- header_function = mail_header_function;
- date_function = mail_date_function;
- finish_header_function = mail_finish_header_function;
- }
- else if(0 == strcmp("netnews", next_argument)){
- typename = next_argument;
- type = "TEXT";
- separator_function = NULL;
- header_function = mail_header_function;
- date_function = mail_date_function;
- finish_header_function = mail_finish_header_function;
- }
- else if(0 == strcmp("catalog", next_argument)){
- typename = next_argument;
- type = "TEXT";
- separator_function = catalog_separator_function;
- header_function = catalog_header_function;
- finish_header_function = catalog_finish_header_function;
- }
- else if(0 == strcmp("bio", next_argument)){
- typename = next_argument;
- type = "TEXT";
- separator_function = bio_separator_function;
- header_function = bio_header_function;
- finish_header_function = bio_finish_header_function;
- }
- else if(0 == strcmp("cmapp", next_argument)){
- typename = next_argument;
- type = "TEXT";
- separator_function = cmapp_separator_function;
- header_function = cmapp_header_function;
- finish_header_function = cmapp_finish_header_function;
- }
- else if(0 == strcmp("pict", next_argument)){
- typename = next_argument;
- type = "PICT";
- }
- else if(0 == strcmp("gif", next_argument)){
- typename = next_argument;
- type = "GIF";
- }
- else if(0 == strcmp("tiff", next_argument)){
- typename = next_argument;
- type = "TIFF";
- }
- else if(0 == strcmp("jargon", next_argument)){
- typename = next_argument;
- type = "TEXT";
- separator_function = jargon_separator_function;
- header_function = jargon_header_function;
- finish_header_function = jargon_finish_header_function;
- }
- else if(0 == strcmp("server", next_argument)){
- typename = next_argument;
- type = "WSRC";
- }
- else if(0 == strcmp("text", next_argument)){
- type = "TEXT";
- typename = next_argument;
- check_for_text_file = true;
- }
- else if(0 == strcmp("irg", next_argument)){
- typename = next_argument;
- type = "TEXT";
- separator_function = irg_separator_function;
- header_function = irg_header_function;
- finish_header_function = irg_finish_header_function;
- }
- /* dash-separated items , Intro to Algorithms buglist, etc */
- else if(0 == strcmp("dash", next_argument)){
- type = "TEXT";
- typename = next_argument;
- separator_function = dash_separator_function;
- header_function = dash_header_function;
- finish_header_function = dash_finish_header_function;
- }
- /* one_line-separated items */
- else if(0 == strcmp("one_line", next_argument)){
- type = "TEXT";
- typename = next_argument;
- separator_function = one_line_separator_function;
- header_function = one_line_header_function;
- finish_header_function = one_line_finish_header_function;
- }
- /* blank line-separated items (paragraphs) */
- else if(0 == strcmp("para", next_argument)){
- type = "TEXT";
- typename = next_argument;
- separator_function = para_separator_function;
- header_function = para_header_function;
- finish_header_function = para_finish_header_function;
- }
- /* seeker items */
- else if(0 == strcmp("seeker", next_argument)){
- type = "TEXT";
- typename = next_argument;
- separator_function = seeker_separator_function;
- header_function = seeker_header_function;
- finish_header_function = seeker_finish_header_function;
- }
- /* rlin items */
- else if(0 == strcmp("rlin", next_argument)){
- type = "TEXT";
- typename = next_argument;
- separator_function = rlin_separator_function;
- header_function = rlin_header_function;
- finish_header_function = rlin_finish_header_function;
- }
- else{
- panic("Don't recognize the '%s' type", next_argument);
- }
- }
- else{
- panic("Don't recognize the '%s' option", next_argument);
- }
- if(NULL == (next_argument = next_arg(&argc, &argv))){
- printf("No files specified\n");
- exit(0);
- }
- }
- start_of_filenames = argc_copy - argc - 1;
-
- waislog(WLOG_MEDIUM, WLOG_INDEX, "Starting to build");
-
- if(true == adding_to_existing_index){
- db = openDatabase(index_filename, false, false);
- if (db == NULL){ /* does not exist, create one */
- db = openDatabase(index_filename, true, false);
- if (db == NULL)
- panic("unable to open the database");
- }
- }
- else{
- db = openDatabase(index_filename, true, false);
- if (db == NULL)
- panic("unable to open the database");
- }
- { /* set up the memory hashtable */
-
- if(memory_to_use < 0){ /* default */
- /* do nothing */
- }
- else if(memory_to_use <= 2){
- hashtable_size = 1L<<16;
- flush_after_n_words = 100000;
- }
- else if(memory_to_use <= 5){
- hashtable_size = 1L<<16;
- flush_after_n_words = 200000;
- }
- else if(memory_to_use <= 10){
- /* shown to take about 6MB on a sun4, when it is dict limited */
- hashtable_size = 1L<<16;
- flush_after_n_words = 500000;
- }
- else if(memory_to_use <= 20){
- hashtable_size = 1L<<17;
- flush_after_n_words = 1200000;
- }
- else{ /* over 20 Mbytes */
- hashtable_size = 1L<<18;
- flush_after_n_words = 4000000;
- }
- init_add_word(db, hashtable_size, flush_after_n_words);
- }
- while(NULL != next_argument){ /* the first filename is in next_argument already */
- if(directoryp(next_argument)){
- if(traverse_directory){
- index_directory(next_argument,
- separator_function,
- header_function,
- date_function,
- finish_header_function,
- type, db,
- check_for_text_file,
- adding_to_existing_index);
- }
- }
- else{ /* not a directory */
- waislog(WLOG_MEDIUM, WLOG_INDEX,
- "Indexing file: %s", next_argument);
- index_text_file(next_argument,
- separator_function,
- header_function,
- date_function,
- finish_header_function,
- type, db,
- check_for_text_file, adding_to_existing_index);
- }
- next_argument = next_arg(&argc, &argv);
- }
- finished_add_word(db);
- {
- char filename[MAX_FILENAME_LEN + 1];
- if(!probe_file(source_filename(filename, db))){
- char database_name[MAX_FILENAME_LEN];
- write_src_structure(source_filename(filename, db),
- export_database?pathname_name(index_filename):
- truename(index_filename, database_name),
- typename,
- &argv_copy[start_of_filenames],
- argc_copy - start_of_filenames,
- export_database,
- 210L);
- }
- /* write out a description of the server if appropriate */
- if(register_database){
- register_src_structure(source_filename(filename, db));
- }
- }
- closeDatabase(db);
- exit(0);
- }
-